import numpy as np
import pandas as pd
### read all txt files - Bible, Quran and Torah
def read_books(book, file):
    """Load a text file as a list of sentences, one per line, and print a summary.

    Args:
        book: Name of the book the file belongs to. The loader does not use
            it; it is kept so existing calls keep working.
        file: Path of the text file to read. Every line becomes one sentence.

    Returns:
        A list with one string per line of the file, with leading and
        trailing whitespace removed.
    """
    with open(file) as fp:
        sentences = [line.strip() for line in fp]
    # np.mean([]) is NaN and '%d' % NaN raises ValueError, so an empty file
    # is reported with an average length of 0 instead of crashing.
    avg_len = np.mean([len(s.split()) for s in sentences]) if sentences else 0
    print('%d sentences loaded, avg. len of %d' % (len(sentences), avg_len))
    return sentences
# Load the three corpora in a fixed order (Bible, Quran, Torah); each call
# prints its own "sentences loaded" summary line.
bible_txt, quran_txt, torah_txt = (
    read_books(title, path)
    for title, path in (
        ('Bible', 'bert_sentences_bible.txt'),
        ('Quran', 'bert_sentences_quran.txt'),
        ('Torah', 'bert_sentences_torah.txt'),
    )
)
31103 sentences loaded, avg. len of 25 6236 sentences loaded, avg. len of 26 5852 sentences loaded, avg. len of 26
# Similarity matrix between the Quran and the Torah: the dot product of the
# sentence embeddings (Q · Tᵀ), where each sentence is a 1024-dimensional vector.
# header=None tells pandas the file has no header row, so columns are numbered from 0.
matrix_quran_torah = pd.read_csv('fout_quran_torah_matrix.csv',header=None)
print(matrix_quran_torah)
0 1 2 3 4 5 6 \
0 148.9715 146.2907 131.1955 149.8157 138.9613 157.4328 159.7639
1 186.9323 185.2500 157.8213 186.8734 171.7242 200.9876 202.5018
2 174.4907 163.5550 149.2671 175.4482 161.0228 179.5805 184.1349
3 203.7982 175.2905 157.1610 197.6879 176.8983 194.7116 194.9597
4 165.4960 167.2928 146.3596 176.9104 163.0966 184.9396 185.4203
... ... ... ... ... ... ... ...
6231 214.7056 181.6996 160.9895 203.3055 180.3280 201.2010 207.8963
6232 233.4554 206.2906 179.2607 227.4013 199.8947 226.7685 231.1902
6233 227.9043 212.0476 180.3873 233.2744 206.5203 234.0838 240.9375
6234 200.7084 192.9666 165.9750 206.7326 183.7348 210.7775 212.4190
6235 192.9361 167.1532 146.1413 186.2831 167.9522 187.0510 193.3309
7 8 9 ... 5842 5843 5844 \
0 155.8478 159.9281 180.5867 ... 179.6809 158.4329 161.1557
1 197.2223 202.5752 233.8762 ... 232.2169 197.6743 199.1541
2 187.8764 185.1517 217.1813 ... 214.4731 181.9475 173.2883
3 217.7127 200.9009 241.6782 ... 250.2603 185.4324 195.2992
4 186.6289 192.3787 221.9086 ... 210.1691 197.9277 167.9584
... ... ... ... ... ... ... ...
6231 219.4134 206.5460 252.9099 ... 262.4917 192.4416 198.4594
6232 242.5501 232.7423 283.0539 ... 287.5822 219.0624 223.0004
6233 248.7643 243.6688 297.0461 ... 301.1655 234.3270 223.9916
6234 217.2661 217.3523 257.3646 ... 258.0340 208.8662 198.7516
6235 203.4664 193.2063 238.2493 ... 244.2432 183.7894 179.0010
5845 5846 5847 5848 5849 5850 5851
0 139.8407 168.2515 165.8752 167.9160 186.5341 173.3833 155.2176
1 177.6949 218.0385 211.9155 211.5829 235.3587 221.0415 191.1636
2 158.0493 204.5385 191.9243 188.7698 220.0307 205.2636 177.4517
3 175.0897 233.6989 221.9762 199.3540 245.8892 218.1908 191.9412
4 170.9216 208.7942 195.0907 194.5288 226.3548 205.8090 178.0538
... ... ... ... ... ... ... ...
6231 176.6817 241.7466 224.2919 206.2259 254.5083 228.7937 196.4136
6232 199.8705 268.8631 250.2974 234.4471 286.1117 256.1010 221.3855
6233 212.1679 288.1273 263.9082 244.9843 298.1145 268.9149 228.6376
6234 189.0465 244.4410 229.2130 216.2914 260.3316 234.3937 203.3525
6235 170.1119 229.8301 214.2509 194.1837 242.0626 216.5669 186.8974
[6236 rows x 5852 columns]
import seaborn as sns
import matplotlib.pyplot as plt
def viz_sim_matrix_from_to(start_iloc, end_iloc, sim_df, text_label_1, text_label_2):
    """Draw a heatmap of one window of a similarity matrix.

    The same positional range ``start_iloc:end_iloc`` is applied to the rows
    and to the columns of ``sim_df``. Normal slice semantics apply, so an end
    beyond the last row or column is clamped to the edge of the matrix.

    Args:
        start_iloc: First row/column position of the window (inclusive).
        end_iloc: Last row/column position of the window (exclusive).
        sim_df: DataFrame holding the similarity scores.
        text_label_1: Name of the first text, used in the title only.
        text_label_2: Name of the second text, used in the title only.
    """
    _, ax = plt.subplots(figsize=(25, 14))
    # Select rows and columns by position in a single step; this avoids the
    # chained label lookup and does not depend on what the column labels are.
    window = sim_df.iloc[start_iloc:end_iloc, start_iloc:end_iloc]
    sns.heatmap(window, ax=ax)
    ax.set_title(
        "Similarity matrix for the verses at positions [{}:{}] in the dataset {} and {}".format(
            start_iloc, end_iloc, text_label_1, text_label_2
        )
    )
# Heatmaps of two 100x100 windows of the Quran (rows) x Torah (columns) matrix.
viz_sim_matrix_from_to(0, 100, matrix_quran_torah, "Quran", "Torah")
viz_sim_matrix_from_to(2500, 2600, matrix_quran_torah, "Quran", "Torah")
# Whole matrix: the helper applies one range to both axes and slices are
# clamped at the edge, so the end has to reach the larger dimension. An end of
# 5852 (the column count) would leave out the Quran rows from position 5852 on.
viz_sim_matrix_from_to(0, max(matrix_quran_torah.shape), matrix_quran_torah, "Quran", "Torah")
#Examples
print(quran_txt[10], "\n") #rows
print(torah_txt[10]) #columns
And who believe in the Revelation sent to thee, and sent before thy time, and (in their hearts) have the assurance of the Hereafter. And God said, Let the earth bring forth grass, the herb yielding seed, and the fruit tree yielding fruit after his kind, whose seed is in itself, upon the earth: and it was so.
# Similarity matrix with one row per Quran sentence and one column per Bible sentence.
# header=None tells pandas the file has no header row, so columns are numbered from 0.
# NOTE(review): presumably embedding dot products as well — confirm against the script that wrote the file.
matrix_quran_bible = pd.read_csv('fout_quran_bible_matrix.csv',header=None)
print(matrix_quran_bible)
0 1 2 3 4 5 6 \
0 148.9715 146.2907 131.1955 149.8157 138.9613 157.4328 159.7639
1 186.9323 185.2500 157.8213 186.8734 171.7242 200.9876 202.5018
2 174.4907 163.5550 149.2671 175.4482 161.0228 179.5805 184.1349
3 203.7982 175.2905 157.1610 197.6879 176.8983 194.7116 194.9597
4 165.4960 167.2928 146.3596 176.9104 163.0966 184.9396 185.4203
... ... ... ... ... ... ... ...
6231 214.7056 181.6996 160.9895 203.3055 180.3280 201.2010 207.8963
6232 233.4554 206.2906 179.2607 227.4013 199.8947 226.7685 231.1902
6233 227.9043 212.0476 180.3873 233.2744 206.5203 234.0838 240.9375
6234 200.7084 192.9666 165.9750 206.7326 183.7348 210.7775 212.4190
6235 192.9361 167.1532 146.1413 186.2831 167.9522 187.0510 193.3309
7 8 9 ... 31093 31094 31095 \
0 155.8478 159.9281 180.5867 ... 181.7611 150.0479 191.8130
1 197.2223 202.5752 233.8762 ... 231.6377 187.6002 244.6747
2 187.8764 185.1517 217.1813 ... 220.8866 176.4334 235.2669
3 217.7127 200.9009 241.6782 ... 242.6877 197.1161 266.9855
4 186.6289 192.3787 221.9086 ... 234.1541 166.0937 238.1499
... ... ... ... ... ... ... ...
6231 219.4134 206.5460 252.9099 ... 251.9230 209.7871 272.8174
6232 242.5501 232.7423 283.0539 ... 280.7609 223.9512 301.1482
6233 248.7643 243.6688 297.0461 ... 301.3262 226.5438 322.8993
6234 217.2661 217.3523 257.3646 ... 258.7450 201.7532 278.7354
6235 203.4664 193.2063 238.2493 ... 238.2878 187.2882 257.0445
31096 31097 31098 31099 31100 31101 31102
0 165.8874 185.2883 154.4005 171.8932 180.7553 164.3891 151.4600
1 212.6985 235.8118 196.5409 217.5966 232.2964 198.3055 175.1651
2 204.0228 221.7737 181.9492 200.4098 210.4559 193.5984 174.5334
3 217.2755 249.6590 200.0320 217.3571 238.9611 202.7377 176.0633
4 202.2580 228.5913 194.7150 212.4823 215.7086 198.1336 167.4373
... ... ... ... ... ... ... ...
6231 231.1496 248.9402 205.2610 220.9153 245.4803 196.0800 172.9991
6232 252.0421 283.4798 229.6710 250.7713 273.3202 227.6276 197.2437
6233 277.6270 298.1970 250.2699 272.7235 291.3109 240.8209 193.5904
6234 239.8492 256.7302 220.6978 239.1208 254.0856 213.4781 175.9973
6235 228.9045 235.9216 196.2799 210.4843 228.9533 187.6367 155.7368
[6236 rows x 31103 columns]
# Heatmaps for the Quran (rows) x Bible (columns) matrix: two 100x100 windows,
# then 0:31103, which reaches the last column (row slicing stops at the last row).
for _start, _end in ((0, 100), (2500, 2600), (0, 31103)):
    viz_sim_matrix_from_to(_start, _end, matrix_quran_bible, "Quran", "Bible")

# Example pair taken at position 10 of each text.
print(quran_txt[10], "\n")  # rows
print(bible_txt[10])  # columns
And who believe in the Revelation sent to thee, and sent before thy time, and (in their hearts) have the assurance of the Hereafter. And God said, Let the earth bring forth grass, the herb yielding seed, and the fruit tree yielding fruit after his kind, whose seed is in itself, upon the earth: and it was so.
# Similarity matrix with one row per Torah sentence and one column per Bible sentence.
# header=None tells pandas the file has no header row, so columns are numbered from 0.
# NOTE(review): presumably embedding dot products as well — confirm against the script that wrote the file.
matrix_torah_bible = pd.read_csv('fout_torah_bible_matrix.csv',header=None)
print(matrix_torah_bible)
0 1 2 3 4 5 6 \
0 218.2175 175.2298 152.3978 191.7979 172.5225 189.3326 194.4987
1 175.2298 216.9501 150.5864 192.6992 181.0281 206.1885 210.7572
2 152.3978 150.5864 173.6630 173.1381 151.3174 167.3538 167.1495
3 191.7979 192.6992 173.1381 227.2607 189.4279 205.2323 212.4757
4 172.5225 181.0281 151.3174 189.4279 198.6628 187.4262 195.6252
... ... ... ... ... ... ... ...
5847 205.0522 202.1181 167.5710 210.7375 196.3705 214.9273 223.6349
5848 189.5864 203.3796 164.4083 207.9727 191.1993 214.0191 224.0728
5849 222.5526 226.2712 184.5502 237.3993 216.5716 243.8556 252.2778
5850 206.1529 214.9547 169.5507 220.1923 202.1722 230.6384 239.1562
5851 181.2745 191.7635 152.7451 194.8535 175.3073 197.3226 203.9465
7 8 9 ... 31093 31094 31095 \
0 203.9536 191.9844 230.0881 ... 210.4883 183.5241 234.5112
1 192.1131 207.2825 238.1096 ... 214.6474 160.0323 226.3087
2 165.2209 175.4805 188.9244 ... 182.9828 141.9539 188.8701
3 207.8296 211.6494 248.6100 ... 231.7099 178.5502 244.6675
4 203.0146 195.6049 229.2434 ... 210.9067 163.0172 220.1759
... ... ... ... ... ... ... ...
5847 224.0961 225.8714 263.9844 ... 254.7079 186.9558 272.5290
5848 207.9355 227.0423 260.0092 ... 252.5897 177.5586 263.2180
5849 245.3065 256.0001 299.8824 ... 302.3203 210.6899 312.3801
5850 224.0466 240.2865 280.7782 ... 276.3078 194.9423 289.2024
5851 195.4632 207.6760 239.4942 ... 232.6567 164.6202 243.9255
31096 31097 31098 31099 31100 31101 31102
0 191.6821 224.4020 177.2850 196.1647 214.9367 178.0158 161.2753
1 198.4500 216.7847 186.2505 207.7735 222.5323 181.4310 149.8268
2 159.6588 179.0682 162.1531 172.5765 177.8409 161.6682 140.8399
3 208.0991 232.3176 198.2452 215.3579 231.9392 192.6848 161.7186
4 190.1617 210.9738 180.0349 197.2220 208.9609 174.5367 147.0330
... ... ... ... ... ... ... ...
5847 234.4896 259.1095 212.0592 239.8911 255.4747 211.1442 174.2437
5848 224.0646 254.0210 213.2525 242.9367 250.1089 211.4447 172.5734
5849 272.9689 299.0027 246.7089 284.8788 292.1589 248.6749 194.1236
5850 248.8914 273.3427 226.7402 262.9929 277.2185 226.4602 185.2283
5851 210.6295 233.6869 194.7718 223.0295 236.3016 194.0484 162.3172
[5852 rows x 31103 columns]
# Heatmaps for the Torah (rows) x Bible (columns) matrix: two 100x100 windows,
# then 0:31103, which reaches the last column (row slicing stops at the last row).
for _start, _end in ((0, 100), (2500, 2600), (0, 31103)):
    viz_sim_matrix_from_to(_start, _end, matrix_torah_bible, "Torah", "Bible")

# Example pair taken at position 10 of each text.
print(torah_txt[10], "\n")  # rows
print(bible_txt[10])  # columns
And God said, Let the earth bring forth grass, the herb yielding seed, and the fruit tree yielding fruit after his kind, whose seed is in itself, upon the earth: and it was so. And God said, Let the earth bring forth grass, the herb yielding seed, and the fruit tree yielding fruit after his kind, whose seed is in itself, upon the earth: and it was so.
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.ticker import StrMethodFormatter
def viz_top_sentences_sim(file, text_label_1, text_label_2):
    """Read a CSV of scored sentence pairs and plot the scores as horizontal bars.

    Args:
        file: Path of a comma-separated file. Its three columns are named
            ``score``, ``s1`` and ``s2`` on load.
            NOTE(review): s1/s2 are presumably the two verse texts — confirm
            against the script that wrote the file.
        text_label_1: Name of the first text, used in the plot title only.
        text_label_2: Name of the second text, used in the plot title only.

    Returns:
        The DataFrame as read from the file, in file order (the sort is
        applied only to the copy that is plotted).
    """
    df = pd.read_csv(file, sep=',', names=['score', 's1', 's2'])
    # These options are global to pandas: they stay in effect after the call
    # and make later DataFrame prints show every column at full width.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    df.sort_values(by='score', ascending=False).plot.barh(
        title='Verse Similarity between {} and {}'.format(text_label_1, text_label_2),
        color='green',
        figsize=(10, 100),
    )
    return df
def vis_hist_most_freq_verses(df, col_book, col_verses, col_frequency):
    """Aggregate one column per distinct verse and show the result as a bar chart.

    Args:
        df: DataFrame containing the columns ``col_book`` and ``col_verses``.
        col_book: Column whose values are aggregated within each group.
        col_verses: Column whose distinct values define the groups.
        col_frequency: Aggregation passed to ``agg`` (e.g. ``'count'``); it is
            also used as the name of the resulting column.

    Returns:
        A DataFrame with one row per distinct ``col_verses`` value and the
        columns ``col_verses`` (cut to its first five characters) and
        ``col_frequency``.
    """
    per_verse = df.groupby([col_verses])[col_book]
    summary = per_verse.agg(col_frequency).to_frame(col_frequency).reset_index()
    # Only the first five characters of each verse are kept, both for the
    # axis labels and in the returned frame.
    # NOTE(review): different verses can share the same five-character prefix
    # — confirm the chart handles such labels as intended.
    summary[col_verses] = summary[col_verses].str.slice(0, 5)

    plt.figure(figsize=(50, 20))
    plt.bar(summary[col_verses], summary[col_frequency], align='center')
    plt.xlabel('Verses')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha="right")
    plt.show()
    return summary
# Report for 'fout_torah_quran.csv': score statistics, then how many distinct
# s2 values (reported below as Torah verses) occur across all rows.
df_tq_0 = viz_top_sentences_sim('fout_torah_quran.csv', text_label_1='Quran', text_label_2='Torah')
df_tq_1 = vis_hist_most_freq_verses(df_tq_0, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_0['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_1), len(torah_txt), "Torah", len(df_tq_0), "Quran"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_1) / len(torah_txt)) * 100, "Torah", "Quran"
    )
)
count 6236.000000 mean 0.920954 std 0.018214 min 0.804100 25% 0.910400 50% 0.923900 75% 0.934000 max 0.963000 Name: score, dtype: float64 There are 1184 out of 5852 distinct verses from the Torah that are considered to be the most similar to the 6236 verses in the Quran. 20.2323991797676% of the verses in the Torah are used to represent all other verses in the Quran.
# Report for 'fout_quran_torah.csv': score statistics, then how many distinct
# s2 values (reported below as Quran verses) occur across all rows.
df_tq_2 = viz_top_sentences_sim('fout_quran_torah.csv', text_label_1='Torah', text_label_2='Quran')
df_tq_3 = vis_hist_most_freq_verses(df_tq_2, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_2['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_3), len(quran_txt), "Quran", len(df_tq_2), "Torah"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_3) / len(quran_txt)) * 100, "Quran", "Torah"
    )
)
count 5852.000000 mean 0.920837 std 0.019368 min 0.719400 25% 0.910800 50% 0.924300 75% 0.934425 max 0.963000 Name: score, dtype: float64 There are 1120 out of 6236 distinct verses from the Quran that are considered to be the most similar to the 5852 verses in the Torah. 17.96023091725465% of the verses in the Quran are used to represent all other verses in the Torah.
# Report for 'fout_torah_bible.csv': score statistics, then how many distinct
# s2 values (reported below as Torah verses) occur across all rows.
df_tq_4 = viz_top_sentences_sim('fout_torah_bible.csv', text_label_1='Bible', text_label_2='Torah')
df_tq_5 = vis_hist_most_freq_verses(df_tq_4, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_4['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_5), len(torah_txt), "Torah", len(df_tq_4), "Bible"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_5) / len(torah_txt)) * 100, "Torah", "Bible"
    )
)
count 31103.000000 mean 0.943023 std 0.031824 min 0.714300 25% 0.923500 50% 0.936800 75% 0.951400 max 1.000000 Name: score, dtype: float64 There are 5719 out of 5852 distinct verses from the Torah that are considered to be the most similar to the 31103 verses in the Bible. 97.72727272727273% of the verses in the Torah are used to represent all other verses in the Bible.
# Report for 'fout_bible_torah.csv': score statistics, then how many distinct
# s2 values (reported below as Bible verses) occur across all rows.
df_tq_6 = viz_top_sentences_sim('fout_bible_torah.csv', text_label_1='Torah', text_label_2='Bible')
df_tq_7 = vis_hist_most_freq_verses(df_tq_6, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_6['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_7), len(bible_txt), "Bible", len(df_tq_6), "Torah"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_7) / len(bible_txt)) * 100, "Bible", "Torah"
    )
)
count 5852.0 mean 1.0 std 0.0 min 1.0 25% 1.0 50% 1.0 75% 1.0 max 1.0 Name: score, dtype: float64 There are 5719 out of 31103 distinct verses from the Bible that are considered to be the most similar to the 5852 verses in the Torah. 18.387293830177153% of the verses in the Bible are used to represent all other verses in the Torah.
# Report for 'fout_quran_bible.csv': score statistics, then how many distinct
# s2 values (reported below as Quran verses) occur across all rows.
df_tq_8 = viz_top_sentences_sim('fout_quran_bible.csv', text_label_1='Bible', text_label_2='Quran')
df_tq_9 = vis_hist_most_freq_verses(df_tq_8, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_8['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_9), len(quran_txt), "Quran", len(df_tq_8), "Bible"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_9) / len(quran_txt)) * 100, "Quran", "Bible"
    )
)
count 31103.000000 mean 0.921541 std 0.019240 min 0.697200 25% 0.911500 50% 0.924800 75% 0.935100 max 0.965900 Name: score, dtype: float64 There are 2739 out of 6236 distinct verses from the Quran that are considered to be the most similar to the 31103 verses in the Bible. 43.92238614496472% of the verses in the Quran are used to represent all other verses in the Bible.
# Report for 'fout_bible_quran.csv': score statistics, then how many distinct
# s2 values (reported below as Bible verses) occur across all rows.
df_tq_10 = viz_top_sentences_sim('fout_bible_quran.csv', text_label_1='Quran', text_label_2='Bible')
df_tq_11 = vis_hist_most_freq_verses(df_tq_10, col_book='s1', col_verses='s2', col_frequency='count')
print(df_tq_10['score'].describe())
print(
    "\nThere are {} out of {} distinct verses from the {} that are considered to be the most similar to the {} verses in the {}.\n".format(
        len(df_tq_11), len(bible_txt), "Bible", len(df_tq_10), "Quran"
    )
)
print(
    "{}% of the verses in the {} are used to represent all other verses in the {}.".format(
        (len(df_tq_11) / len(bible_txt)) * 100, "Bible", "Quran"
    )
)
count 6236.000000 mean 0.928232 std 0.016433 min 0.820700 25% 0.918600 50% 0.930700 75% 0.940000 max 0.965900 Name: score, dtype: float64 There are 2744 out of 31103 distinct verses from the Bible that are considered to be the most similar to the 6236 verses in the Quran. 8.822300099668842% of the verses in the Bible are used to represent all other verses in the Quran.